library(ggplot2)
library(tidyverse)
library(lubridate)
Accidents_Original <- readRDS("farsp.RDS")
alcohol <- read.csv("alcohol.csv")
alcohol <- alcohol %>%
mutate(description = if_else(description %in% c("Unknown", "Not reported"), NA, description))
body_typ <- read.csv("body_typ.csv")
body_typ <- body_typ %>%
mutate(description = if_else(description %in% c("Unknown body type", "Not Reported"), NA, description))
col_names <- read.csv("column_name_description.csv")
inj_sev <- read.csv("inj_sev.csv")
inj_sev <- inj_sev %>%
mutate(description = if_else(description %in% c("Unknown/Not Reported", "N/A"), NA, description))
man_coll <- read.csv("man_coll.csv")
man_coll <- man_coll %>%
mutate(description = if_else(description %in% c("Not Reported", "Unknown"), NA, description))
sex <- read.csv("sex.csv")
sex <- sex %>%
mutate(description = if_else(description %in% c("not reported", "unknown"), NA, description))
state_code <- read.csv("state_code.csv")
col_names <- setNames(col_names$column_name, col_names$description)
names(col_names) <- gsub(" ", "_", names(col_names))
Names_Changed <- Accidents_Original %>%
rename(alcohol = drinking) %>%
left_join(state_code, by = c("state" = "state_code")) %>%
left_join(sex, by = "sex") %>%
left_join(man_coll, by = "man_coll") %>%
left_join(inj_sev, by = "inj_sev") %>%
left_join(body_typ, by = "body_typ") %>%
left_join(alcohol, by = "alcohol") %>%
mutate(across(c(year, month, day, hour, minute, age), ~replace(., . == 99, NA)),
across(c(age), ~replace(., . == 999, NA))) %>%
drop_na(c("year", "month", "day", "hour", "minute")) %>%
mutate(
state = as.character(state_name),
sex = as.character(description.x),
man_coll = as.character(description.y),
alcohol = as.character(description),
inj_sev = as.character(description.x.x),
body_typ = as.character(description.y.y),
Datetime = ymd_hm(paste(year, month, day, hour, minute)),
Month_Name = month(month, label = TRUE)
) %>%
rename(!!!col_names) %>%
mutate(
State = as.factor(State),
County = as.factor(County),
Manner_of_collision = as.factor(Manner_of_collision),
Vehicle_body_type = as.factor(Vehicle_body_type),
Sex = as.factor(Sex),
Injury_severity = factor(Injury_severity, levels = c("No Apparent Injury", "Injured, Severity Unknown", "Possible Injury", "Suspected Minor Injury", "Suspected Serious Injury", "Non-incapacitating injury", "Fatal Injury", "Died Prior to Crash")),
Injury_Severity_Basic = if_else(Injury_severity %in% c("Fatal Injury", "Died Prior to Crash"), "Fatal", "Non-Fatal"),
Datetime = as.POSIXct(Datetime),
Date = as.Date(Datetime),
Accident_weekday = factor(wday(Datetime, label = TRUE), levels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")),
Year = year(Datetime)
) %>%
filter(Age < 100) %>%
select(-State, -description.x, -description.y, -description.x.x, -description.y.y,-description)
ggplot(Names_Changed %>% filter(!is.na(state_name)), aes(x = state_name)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Number of Accidents per State")
ggplot(Names_Changed %>% filter(!is.na(Datetime)), aes(x = month(Datetime, label = TRUE))) +
geom_bar() +
ggtitle("Number of Accidents per Month")
ggplot(Names_Changed, aes(x = Date, y = after_stat(count))) +
geom_line(stat = "count") +
labs(x = "Date", y = "Accidents", title = "Accident Rates Over Time")
Graph 4: Number of Accidents per Hour
ggplot(Names_Changed %>% filter(!is.na(Accident_hour)), aes(x = Accident_hour)) +
geom_bar() +
ggtitle("Number of Accidents per Hour")
ggplot(Names_Changed %>% filter(!is.na(Age)), aes(x = Age)) +
geom_histogram(binwidth = 1) +
ggtitle("Distribution of Age of Persons Involved in Accidents")
ggplot(Names_Changed %>% filter(!is.na(Sex)), aes(x = Sex)) +
geom_bar() +
ggtitle("Number of Accidents by Sex")
ggplot(Names_Changed %>% filter(!is.na(Alcohol_involvement)), aes(x = Alcohol_involvement)) +
geom_bar() +
ggtitle("Number of Accidents with Alcohol Involvement")
Graph 8: Distribution of Injury Severity
ggplot(Names_Changed %>% filter(!is.na(Injury_severity)), aes(x = Injury_severity)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Distribution of Injury Severity")
ggplot(Names_Changed %>% filter(!is.na(Manner_of_collision)), aes(x = Manner_of_collision)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Number of Accidents by Manner of Collision")
accidents_by_body_type_injury_sex <- Names_Changed %>%
filter(!is.na(Vehicle_body_type), !is.na(Injury_severity)) %>%
group_by(Vehicle_body_type, Injury_severity, Sex, Alcohol_involvement) %>%
summarize(n = n()) %>%
mutate(Vehicle_body_type = case_when(
grepl("sedan|coupe|convertible|hatchback|automobile|compact", Vehicle_body_type, ignore.case = TRUE) ~ "Automobiles",
grepl("motorcycle|scooter|moped", Vehicle_body_type, ignore.case = TRUE) ~ "Motorcycles and Scooters",
grepl("van|minivan", Vehicle_body_type, ignore.case = TRUE) ~ "Vans and Minivans",
grepl("pickup|utility|truck", Vehicle_body_type, ignore.case = TRUE) ~ "Trucks",
grepl("bus", Vehicle_body_type, ignore.case = TRUE) ~ "Buses",
grepl("ATV|snowmobile|golf cart|off-highway", Vehicle_body_type, ignore.case = TRUE) ~ "Specialized Vehicles",
TRUE ~ "Unknown/Other Vehicle Types"
)) %>%
filter(n > 4000) %>%
mutate(Injury_severity = factor(Injury_severity, levels = c("No Apparent Injury", "Possible Injury", "Suspected Minor Injury", "Suspected Serious Injury", "Fatal Injury", "Injured, Severity Unknown", "Died Prior to Crash"), ordered = TRUE))
## `summarise()` has grouped output by 'Vehicle_body_type', 'Injury_severity',
## 'Sex'. You can override using the `.groups` argument.
ggplot(accidents_by_body_type_injury_sex, aes(x = Vehicle_body_type)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Number of Accidents by Vehicle Body Type")
ggplot(Names_Changed, aes(x = Age)) +
geom_histogram(binwidth = 1) +
labs(x = "Age", y = "Count", title = "Age Distribution of Persons Involved in Accidents")
ggplot(Names_Changed, aes(x = Injury_severity, y = Age)) +
geom_boxplot() +
labs(x = "Injury Severity", y = "Age", title = "Age Distribution by Injury Severity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(Names_Changed, aes(x = Manner_of_collision)) +
geom_bar() +
labs(x = "Manner of Collision", y = "Count", title = "Manner of Collision Distribution") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
accidents_by_hour_month <- Names_Changed %>%
group_by(Accident_hour, Month_Name) %>%
summarize(Count = n()) %>%
ungroup()
## `summarise()` has grouped output by 'Accident_hour'. You can override using the
## `.groups` argument.
ggplot(accidents_by_hour_month, aes(x = Accident_hour, y = Count, fill = factor(Month_Name))) +
geom_col(position = "stack") +
labs(title = "Number of Accidents by Hour of the Day and Month") +
scale_fill_discrete(name = "Hour of the Day") +
theme_minimal()
ggplot(Names_Changed, aes(x = Age, y = Number_of_persons_involved)) +
geom_point(alpha = 0.3) +
labs(x = "Age", y = "Number of persons involved", title = "Relationship between Age and Number of Persons Involved in Accidents")
ggplot(Names_Changed, aes(x = Age, y = Number_of_persons_involved)) +
geom_point(alpha = 0.3) +
labs(x = "Age", y = "Number of persons involved", title = "Relationship between Age and Number of Persons Involved in Accidents")
ggplot(Names_Changed, aes(x = Accident_hour, y = Accident_day)) +
geom_bin2d(bins = 20) +
labs(x = "Hour of day", y = "Day of month", fill = "Number of accidents", title = "Density of Accidents by Day of Month and Time of Day")
ggplot(Names_Changed, aes(x = Accident_weekday)) +
geom_bar() +
labs(x = "Day of week", y = "Number of accidents", title = "Total Number of Accidents by Day of the Week")
ggplot(Names_Changed, aes(x = Accident_year)) +
geom_bar() +
labs(x = "Year", y = "Number of accidents", title = "Total Number of Accidents by Year")
# ggplot(Names_Changed, aes(x = Accident_hour, y = Age)) +
# geom_point(alpha = 0.003) +
# labs(x = "Hour of day", y = "Age", title = "Relationship between Age and Time of Day When Accidents Occur")
ggplot(Names_Changed, aes(x = Accident_hour, y = Age)) +
geom_point(alpha = 0.003) +
labs(x = "Hour of day", y = "Age", title = "Relationship between Age and Time of Day When Accidents Occur") +
scale_alpha(trans = "log", range = c(0.005, 0.5))
ggplot(accidents_by_body_type_injury_sex, aes(x = Vehicle_body_type, fill = Injury_severity)) +
geom_bar(position = "stack") +
scale_fill_brewer(type = "div", palette = "RdYlBu", direction = -1) +
labs(x = "Vehicle body type", y = "Number of accidents", title = "Number of Accidents by Vehicle Body Type and Injury Severity")+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(Names_Changed, aes(x = Sex, y = Age)) +
geom_boxplot() +
labs(x = "Sex", y = "Age", title = "Distribution of Age by Sex Involved in Accidents")
ggplot(Names_Changed, aes(x = Month_Name, fill = Injury_severity)) +
geom_bar() +
labs(x = "Month", y = "Number of accidents", title = "Number of Accidents by Month and Injury Severity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(subset(Names_Changed, Number_of_persons_involved < 10), aes(x = Number_of_persons_involved)) +
geom_density() +
labs(x = "Number of persons involved", y = "Density", title = "Distribution of Number of Persons Involved in Accidents")
ggplot(Names_Changed, aes(x = Manner_of_collision, y = Age)) +
geom_boxplot() +
labs(x = "Manner of collision", y = "Age", title = "Distribution of Age by Manner of Collision")
accidents_by_hour_weekday <- Names_Changed %>%
group_by(hour = hour(Datetime), weekday = wday(Datetime, label = TRUE)) %>%
summarize(n = n())
## `summarise()` has grouped output by 'hour'. You can override using the
## `.groups` argument.
ggplot(accidents_by_hour_weekday, aes(x = hour, y = weekday, fill = n)) +
geom_tile() +
scale_fill_gradient(low = "#f7f7f7", high = "#e31a1c") +
labs(x = "Hour of day", y = "Day of week", fill = "Number of accidents", title = "Heatmap of Accidents by Hour and Day of the Week")
ggplot(accidents_by_body_type_injury_sex, aes(x = Vehicle_body_type, y = n, fill = Injury_severity))+
geom_col(position = "dodge") +
facet_wrap(~ Sex, scales = "free_y") +
scale_fill_brewer(type = "div", palette = "RdYlBu", direction = -1) +
labs(x = "Vehicle body type", y = "Number of accidents", fill = "Injury severity", title = "Accidents by Vehicle Body Type, Injury Severity, and Sex")+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(Names_Changed %>% filter(!is.na(Sex)), aes(x = Accident_hour, y = Age, color = Injury_Severity_Basic)) +
geom_point(alpha = 0.005) +
facet_wrap(~ Sex) +
labs(x = "Hour of day", y = "Age", color = "Injury severity", title = "Relationship between Hour of Day and Age of Persons Involved in Accidents, Colored by Injury Severity and Faceted by Sex")+
guides(color = guide_legend(override.aes = list(alpha = 1)))
accidents_per_day <- Names_Changed %>%
mutate(DoY = yday(Datetime), Month = month(Datetime), Day = day(Datetime), Hour = hour(Datetime)) %>%
group_by(Month, Day, Hour) %>%
summarize(count = n()) %>%
ungroup()
## `summarise()` has grouped output by 'Month', 'Day'. You can override using the
## `.groups` argument.
ggplot(accidents_per_day, aes(x = Day, y = Month, fill = count)) +
geom_tile(color = "white") +
scale_y_continuous(trans = 'reverse', breaks = 1:12, labels = month.abb) +
labs(x = "Day", y = "Month", fill = "Accidents", title = "Visualizing the Frequency of Accidents Throughout the Year") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(accidents_per_day, aes(x = Day, y = Hour, fill = count - mean(count))) +
geom_tile(width = 0.5) +
scale_y_reverse() +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
facet_wrap(~ month(Month, label = TRUE)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 1)) +
labs(x = "Day", y = "Time", fill = "Accidents", title = "Analyzing the Relationship Between Time and Accidents Across Different Months")
ggplot(data = Names_Changed, aes(x = Age, y = Number_of_persons_involved)) +
geom_point(alpha = 0.05) +
theme_minimal()
Scatter plot showing the relationship between the age of individuals involved in accidents and the number of persons involved.
monthly_accidents <- Names_Changed %>%
mutate(Month = floor_date(Datetime, "month")) %>%
count(Month)
ggplot(monthly_accidents, aes(x = Month, y = n)) +
geom_line() +
theme_minimal() +
labs(x = "Year", y = "Number of Accidents")
Seasonal decomposition plot showing the trend of accidents over time, highlighting any seasonality in the data.
ggplot(Names_Changed, aes(x= Number_of_vehicles_involved, y = Number_of_persons_involved))+
geom_point(alpha = 0.1)
Scatter plot showing the relationship between the number of vehicles and the number of persons involved in accidents.
ggplot(data = Names_Changed, aes(x = Age)) +
geom_density(fill = "lightblue", alpha = 0.6) +
theme_minimal() +
labs(x = "Age", y = "Density")
Density plot showing the distribution of age of individuals involved in accidents.
ggplot(data = Names_Changed, aes(x = Age, y = Number_of_persons_involved)) +
geom_point(alpha = 0.2) +
facet_wrap(~ Sex) +
theme_minimal() +
labs(x = "Age", y = "Number of Persons Involved")
Scatter plot showing the relationship between age and number of persons involved in accidents, separated by sex.
ggplot(data = Names_Changed, aes(x = Sex, y = Age, fill = Sex)) +
geom_violin() +
theme_minimal() +
labs(x = "Sex", y = "Age")
Violin plot showing the distribution of age of individuals involved in accidents, separated by sex.
ggplot(data = Names_Changed, aes(x = Age, y = Number_of_persons_involved)) +
geom_hex(bins = 30) +
theme_minimal() +
labs(x = "Age", y = "Number of Persons Involved")
Hexbin plot showing the density of points in a hexagonal grid, with the x-axis representing age and y-axis representing the number of persons involved in accidents.
library(ggridges)
ggplot(data = Names_Changed, aes(x = Age, y = Injury_severity, fill = Injury_severity)) +
geom_density_ridges() +
theme_minimal() +
labs(x = "Age", y = "Injury Severity")
## Picking joint bandwidth of 2.5
Ridge plot showing the distribution of age of individuals involved in accidents, separated by injury severity.
ggplot(data = Names_Changed, aes(x = Age, y = ..count.., fill = Injury_severity)) +
stat_density(geom = "line") +
scale_fill_discrete(name = "Injury Severity") +
labs(x = "Age", y = "Count of Rows") +
theme_minimal()+
facet_wrap(~ Injury_severity, scales = "free")+
xlim(0, 100)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
Ridge plot showing the distribution of age of individuals involved in accidents, separated by injury severity.
accidents_by_body_type_injury_sex to make sure this is
working properly)ggplot(data = accidents_by_body_type_injury_sex, aes(x = Vehicle_body_type, y = n)) +
geom_col() +
facet_wrap(~ Alcohol_involvement) +
theme_minimal() +
labs(x = "Vehicle Body Type", y = "Number of Accidents") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Bar plot showing the number of accidents by vehicle body type, faceted by alcohol involvement.
ggplot(data = Names_Changed, aes(x = Accident_hour, y = Number_of_persons_involved, color = Sex)) +
geom_jitter(alpha = 0.1, width = 0.2, height = 0.2) +
theme_minimal() +
labs(x = "Hour of the Day", y = "Number of Persons Involved")
Scatter plot showing the relationship between the hour of the day and the number of persons involved in accidents, colored by sex.
state_injury_severity <- Names_Changed %>% count(state_name, Injury_severity)
ggplot(data = state_injury_severity, aes(x = state_name, y = Injury_severity, size = n, color = Injury_severity)) +
geom_point(alpha = 0.5) +
theme_minimal() +
labs(x = "State", y = "Injury Severity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Bubble plot showing the number of accidents by state and injury severity, with bubble size indicating the number of accidents and color indicating injury severity.
monthly_accidents <- Names_Changed %>% count(Year = year(Datetime), Month = month(Datetime))
ggplot(data = monthly_accidents, aes(x = month(Month, label = TRUE), y = n, fill = as.factor(Year))) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(x = "Month", y = "Number of Accidents", fill = "Year")
Bar plot showing the number of accidents by month and year.
Graph 43: Ridgeline plot of the number of accidents by state and hour. RI and DC have some issues with early morning accidents!
ggplot(Names_Changed, aes(x = Accident_hour, y = state_name)) +
ggridges::geom_density_ridges() +
labs(title = "Ridgeline Plot: Accident Hour", x = "Hour", y = "State") +
theme_ridges()
## Picking joint bandwidth of 0.819
Ridgeline plot of number of accidents by state and hour
ggplot(Names_Changed, aes(x = Injury_severity, y = Age, fill = Injury_severity)) +
geom_boxplot() +
labs(title = "Boxplot of Age by Injury Severity", x = "Injury Severity", y = "Age") +
theme_minimal()
Boxplot of Age by Injury Severity
ggplot(Names_Changed, aes(sample = Age)) +
geom_qq() +
geom_qq_line(color = "red") +
labs(title = "Q-Q Plot of Age", x = "Theoretical Quantiles", y = "Sample Quantiles") +
theme_minimal()
# ggplot(Names_Changed, aes(x = Age, y = Number_of_persons_involved)) +
# geom_point(alpha = 0.2, aes(color = Sex)) +
# geom_smooth(method = "loess", se = FALSE, color = "red") +
# labs(title = "Scatterplot of Age vs. Number of Persons Involved with LOESS fit", x = "Age", y = "Number of Persons Involved") +
# theme_minimal()
# # Fit a linear model
# model <- lm(Number_of_persons_involved ~ Age, data = Names_Changed)
#
# # Create a residual plot
# ggplot(data.frame(residuals = resid(model), fitted = fitted(model)), aes(x = fitted, y = residuals)) +
# geom_point(alpha = 0.2) +
# geom_smooth(se = FALSE, color = "red", method = "loess") +
# labs(title = "Residual Plot", x = "Fitted Values", y = "Residuals") +
# theme_minimal()
ggplot(Names_Changed, aes(x = Age)) +
geom_density(fill = "steelblue", alpha = 0.5) +
labs(title = "Density Plot of Age", x = "Age", y = "Density") +
theme_minimal()
Names_Changed <- Names_Changed %>%
mutate(region = ifelse(state_name %in% c("Maine", "New Hampshire", "Vermont", "Massachusetts", "Rhode Island", "Connecticut"), "Northeast",
ifelse(state_name %in% c("New York", "Pennsylvania", "New Jersey"), "Northeast",
ifelse(state_name %in% c("Wisconsin", "Michigan", "Illinois", "Indiana", "Ohio"), "Midwest",
ifelse(state_name %in% c("Minnesota", "Iowa", "Missouri", "North Dakota", "South Dakota", "Nebraska", "Kansas"), "Midwest",
ifelse(state_name %in% c("Delaware", "Maryland", "District of Columbia", "Virginia", "West Virginia", "North Carolina", "South Carolina", "Georgia", "Florida"), "South",
ifelse(state_name %in% c("Kentucky", "Tennessee", "Mississippi", "Alabama", "Oklahoma", "Texas", "Arkansas", "Louisiana"), "South",
ifelse(state_name %in% c("Montana", "Idaho", "Wyoming", "Colorado", "New Mexico", "Arizona", "Utah", "Nevada"), "West",
ifelse(state_name %in% c("Washington", "Oregon", "California", "Alaska", "Hawaii"), "West", NA_character_)))))))))
SingleWindow <- function(statistic, data) {
if (!is.data.frame(data)) {
stop("The input data must be a dataframe.")
}
switch(statistic,
"correlation" = list(Value = cor(data[,2], data[,3]), Midpoint = data[ceiling(nrow(data)/2), 1]),
"p value" = list(Value = cor.test(data[,2], data[,3])$p.value, Midpoint = data[ceiling(nrow(data)/2), 1]),
"OLS" = list(Value = summary(lm(data[,3] ~ data[,2]))$coefficients, Midpoint = data[ceiling(nrow(data)/2), 1]),
stop("The input statistic must be one of 'correlation', 'p value', or 'OLS'.")
)
}
MovingWindow <- function(data, posix_col, col1, col2, window_days, step_size, statistic) {
# Ensure the input POSIX column is in POSIXct format
data[[posix_col]] <- as.POSIXct(data[[posix_col]])
# Calculate the minimum and maximum POSIX values in the dataset
min_date <- min(data[[posix_col]])
max_date <- max(data[[posix_col]])
# Convert window_days and step_size to seconds
window_seconds <- window_days * 24 * 60 * 60
step_seconds <- step_size * 24 * 60 * 60
# Set the starting date for the moving window
current_date <- min_date
results = list()
# Iterate through the dataset using the moving window
while (current_date + window_seconds <= max_date) {
# Filter the data within the moving window
window_data <- data[data[[posix_col]] >= current_date & data[[posix_col]] <= current_date + window_seconds, c(posix_col, col1, col2)]
# Select the posix_col, col1, and col2 using dplyr and set it equal to win_val
win_val <- window_data %>%
select(all_of(c(posix_col, col1, col2)))
# Print the win_val dataframe for the current window
# print(colnames(win_val))
result <- SingleWindow(statistic, win_val)
results <- rbind(results, data.frame(DateTime = current_date,
Value = result$Value[1],
Midpoint = as.POSIXct(result$Midpoint$Datetime)))
# Move the window by step_size
current_date <- current_date + step_seconds
}
# print(results)
ggplot(data = results, aes(x=Midpoint,Value))+
geom_point()
}
MovingWindow(Names_Changed, "Datetime", "Number_of_persons_involved", "Number_of_vehicles_involved", 30, 15, "correlation")